# Notebook setup: install mapping helpers used for geographic plotting.
!pip install mplleaflet
!pip3 install https://github.com/matplotlib/basemap/archive/master.zip
# Silence matplotlib's axes-level deprecation chatter so cell output stays clean.
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import matplotlib.cm as cm
import warnings
# NOTE(review): blanket-ignoring all warnings hides real issues; consider narrowing the filter.
warnings.filterwarnings('ignore')
There are six CSV files of data on Uber pickups in New York City from April to September 2014. Each file contains data for a particular month, and each has the following columns:
!curl -O https://transfer.sh/jPMAEq/Dataset.zip
!unzip "/content/Dataset.zip" -d "/content/"
import pandas as pd
April = pd.read_csv("/content/Apr.csv")
May = pd.read_csv("/content/May.csv")
June = pd.read_csv("/content/Jun.csv")
July = pd.read_csv("/content/Jul.csv")
Aug = pd.read_csv("/content/Aug.csv")
Sep = pd.read_csv("/content/Sep.csv")
Data = pd.DataFrame()
Files = [April, May, June, July, Aug, Sep]
for file in Files:
Data = pd.concat([Data, file])
Date/Time: The date and time of the Uber pickup
Lat (Latitude): The latitude of the Uber pickup
Lon (Longitude): The longitude of the Uber pickup
Base: The TLC base company code affiliated with the Uber pickup.
Descriptive statistical analysis helps to describe the basic features of a dataset and obtain a brief summary of the data.
The describe() method in the Pandas library helps us to get a brief summary of the dataset.
It automatically calculates basic statistics for all numerical variables, excluding NaN values (we will come to this part later).
Pandas head() method is used to return top n (5 by default) rows of a data frame or series.
# Peek at the first rows to sanity-check the combined load.
Data.head() # Display First 5 Records
The info() function is used to print a concise summary of a DataFrame.
This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage.
# Summary of column dtypes, non-null counts, and memory usage.
Data.info()
Pandas describe() is used to view some basic statistical details like percentile, mean, std etc. of a data frame or a series of numeric values.
# Basic statistics (count, mean, std, quartiles) for the numeric columns.
Data.describe()
The dtypes property is used to find the dtypes in the DataFrame.
This returns a Series with the data type of each column.
The result's index is the original DataFrame's columns.
Columns with mixed types are stored with the object dtype.
# Data type of each column; mixed-type columns show up as object.
Data.dtypes
The shape property returns a tuple representing the dimensionality of the DataFrame.
The format of shape would be (rows, columns)
# Dimensions of the DataFrame, unpacked directly from its (rows, columns) shape.
Rows, Columns = Data.shape
print("Rows :", Rows)
print("Columns :", Columns)
# Keep the column labels around for the null-column bookkeeping below.
Column_Names = Data.columns
Heatmaps visualize the data in a 2-dimensional format in the form of colored maps.
The color maps use hue, saturation, or luminance to achieve color variation to display various details.
This color variation gives visual cues to the readers about the magnitude of numeric values.
Heatmaps are about replacing numbers with colors because the human brain understands visuals better than numbers, text, or any written data.
Heatmaps can describe the density or intensity of variables, visualize patterns, variance, and even anomalies.
Heatmaps show relationships between variables.
These variables are plotted on both axes. We look for patterns in the cell by noticing the color change.
# Visualize missing values: any off-color cell in the heatmap marks a NaN entry.
import seaborn as sn
sn.heatmap(data=Data.isnull(), cmap='viridis', cbar=False, yticklabels=False);
Pandas provides functions to check the number of missing values in the dataset.
Missingno library takes it one step further and provides the distribution of missing values in the dataset by informative visualizations.
Using the plots of missingno, we are able to see where the missing values are located in each column and if there is a correlation between missing values of different columns.
Before handling missing values, it is very important to explore them in the dataset.
import missingno as msno
# Bar chart of non-null counts per column.
msno.bar(Data, figsize=(5, 4));
# How strongly the missingness of different columns correlates.
msno.heatmap(Data, figsize=(5, 5));
# Per-column count of NaN cells.
Data.isnull().sum()
# Per-column count of non-NA cells (complement of the above).
Data.count()
# Identify columns that are entirely (or all but one row) NaN so they can be
# dropped later. NOTE(review): the original also flagged columns with exactly
# one non-null value (Rows - 1 NaNs); that tolerance is preserved here.
Null_Data = Data.isnull().sum()
# Iterate (column name, null count) pairs directly instead of positional
# Series indexing (Null_Data[i]), which is deprecated in modern pandas.
Null_Columns = [column for column, null_count in Null_Data.items()
                if null_count >= Rows - 1]
print(Null_Columns)
It's evident that there is no column in the dataset which has only NULL values.
# Remove every column flagged above as containing only NULL values
# (a no-op when the list is empty, as it is for this dataset).
Data.drop(columns=Null_Columns, inplace=True)
Data
# Any remaining NaNs per column?
Data.isnull().any()
Data.isnull().sum()
# Display the Rows which has one or more NULL values in it
Data[Data.isnull().any(axis=1)]
# Drop those partially-empty rows in place.
Data.dropna(inplace=True)
# Verify the frame is now NaN-free.
Data.isnull().any()
print(Data.isnull().sum())
Data.shape
# Check if there is any Duplicate Rows
duplicate = Data[Data.duplicated()]
# shape[0] is the duplicate-row count; the original printed the whole
# (rows, columns) shape tuple while labelling it as a count.
print("Number of Duplicate rows: ", duplicate.shape[0])
Data.count()
# Drop all the Duplicate Rows
Data = Data.drop_duplicates()
Data.count()
len(Data)
# Summary of the object (string) columns only.
Data.describe(include=["O"])
# Parse the pickup timestamp once, then derive calendar features from it.
Data['Date/Time'] = pd.to_datetime(Data['Date/Time'])
from datetime import datetime as dt
# Bind the .dt accessor once instead of re-fetching it for every derived column.
pickup = Data['Date/Time'].dt
Data["Year"] = pickup.year
Data["Day"] = pickup.day
Data["Month"] = pickup.month
Data["Hour"] = pickup.hour
Data["Minute"] = pickup.minute
Data["Second"] = pickup.second
Data.head()
# Weekday names, Monday-first (matches pandas' 0-6 weekday numbering).
WeekDays = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
Data["Week_Day_Int"] = pickup.weekday
Data["Week_Day_String"] = pickup.day_name()
# List the distinct months present in the data.
Data.Month.unique().tolist()
# Total pickups per month (count of any non-null column, Lat here).
Month = Data.groupby("Month").count()
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12,5))
sns.set(style="whitegrid")
sns.barplot(x = Month.index, y = Month.Lat);
# Same monthly totals via a pivot table, plotted with pandas.
Monthly_uber_rides = Data.pivot_table(index=['Month'], values='Base', aggfunc='count')
Monthly_uber_rides.plot(kind='bar',figsize=(8,6))
plt.ylabel('Total Journeys')
plt.title('Months');
# Total pickups per day of the month (1-31).
Day = Data.groupby("Day").count()
plt.figure(figsize=(12,5))
sns.set(style="whitegrid")
ax = sns.barplot(x = Day.index, y = Day.Lat)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right",fontsize=10)
plt.tight_layout()
plt.show()
sns.set(rc={'figure.figsize':(10, 5)})
# NOTE(review): distplot is deprecated in seaborn >= 0.11; histplot/displot is the replacement.
sns.distplot(Data["Day"]);
# Total pickups per weekday; Week_Day_Int is 0 (Monday) through 6 (Sunday).
Week_Day = Data.groupby("Week_Day_Int").count()
plt.figure(figsize=(12,5))
sns.set(style="whitegrid")
ax = sns.barplot(x = Week_Day.index, y = Week_Day.Lat)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right",fontsize=12)
plt.tight_layout()
plt.show()
# NOTE(review): distplot is deprecated in seaborn >= 0.11; histplot/displot is the replacement.
sns.distplot(Data["Week_Day_Int"])
# Same weekday totals via a pivot table, with readable day names in the index.
Daily_uber_rides = Data.pivot_table(index=['Week_Day_Int','Week_Day_String'], values='Base', aggfunc='count')
Daily_uber_rides.plot(kind='bar',figsize=(8, 8))
plt.ylabel('Total Journeys')
plt.title('Days of the Week');
# Pickups per weekday, broken out by month.
Daily_uber_rides_month = Data.groupby(['Month','Week_Day_Int','Week_Day_String'])['Base'].count()
Daily_uber_rides_month = Daily_uber_rides_month.reset_index()
sns.set_style('darkgrid')
# One line per month across the days of the week.
ax = sns.pointplot(x="Week_Day_String", y="Base", hue="Month", data = Daily_uber_rides_month)
handles, labels = ax.get_legend_handles_labels()
ax.set_xlabel('Day of Week', fontsize = 15)
ax.set_ylabel('Total Uber Pickups', fontsize = 15)
ax.set_title('Total Number of Pickups for Each Weekday per Month (April-September 2014)', fontsize=16)
ax.tick_params(labelsize = 8)
# Re-attach the legend with a title and smaller fonts.
ax.legend(handles,labels,loc=0, title="Months", prop={'size':10})
ax.get_legend().get_title().set_fontsize('8')
plt.show()
# Total pickups per hour of day (0-23).
Hour = Data.groupby("Hour").count()
sns.set(style = "whitegrid")
plt.figure(figsize=(12,8))
sns.barplot( x = Hour.index, y = Hour.Lat);
# NOTE(review): distplot is deprecated in seaborn >= 0.11; histplot/displot is the replacement.
sns.distplot(Data["Hour"]);
fig = plt.figure(figsize=(12,6));
# Same hourly totals via a pivot table, plotted with pandas.
Uber_hour = Data.pivot_table(index=['Hour'], values='Base', aggfunc='count')
Uber_hour.plot(kind='bar', figsize=(8,6))
plt.ylabel('Total Journeys')
plt.title('Journeys by Hour');
# Ride counts per (month, weekday, hour); 'Base' is simply a non-null column
# whose count gives the number of pickups in each group.
Hourly_ride_data = (
    Data.groupby(['Month','Week_Day_Int','Hour','Week_Day_String'])['Base']
        .count()
        .reset_index()
        .rename(columns={'Base':'RideCount'})
)
# Quick visual check of the result.
Hourly_ride_data.head()
# Average rides per (weekday, hour), averaged across the six months.
Weekday_hourly_avg = (
    Hourly_ride_data.groupby(['Week_Day_String','Hour'])['RideCount']
        .mean()
        .reset_index()
        .rename(columns={'RideCount':'AverageRides'})
        .sort_index()
)
# Quick visual checks of the result.
Weekday_hourly_avg.head()
Weekday_hourly_avg.Week_Day_String.value_counts()
# Truncate the float averages to whole rides.
Weekday_hourly_avg['AverageRides'] = Weekday_hourly_avg['AverageRides'].astype('int')
Weekday_hourly_avg['AverageRides'].dtypes
Weekday_hourly_avg.head()
# Weekday x hour matrix of average pickups, rendered as a heatmap.
Heatmap = pd.pivot_table(Weekday_hourly_avg, values='AverageRides', index=['Week_Day_String'], columns='Hour')
plt.figure(figsize=(16,10));
sns.heatmap(Heatmap);
fig = plt.figure(figsize=(12,6))
sns.set_style('darkgrid')
# One line per weekday across the 24 hours.
ax = sns.pointplot(x = "Hour", y = "AverageRides", hue = "Week_Day_String", data = Weekday_hourly_avg)
handles, labels = ax.get_legend_handles_labels()
# Reorder legend entries by hand. NOTE(review): these hard-coded indices
# assume exactly seven weekday series in a specific initial order — fragile;
# confirm the intended Monday-to-Sunday ordering still holds after any
# seaborn/pandas upgrade.
handles = [handles[1], handles[5], handles[6], handles[4], handles[0], handles[2], handles[3]]
labels = [labels[1], labels[5], labels[6], labels[4], labels[0], labels[2], labels[3]]
ax.set_xlabel('Hour of Day', fontsize = 15)
ax.set_ylabel('Average Uber Pickups', fontsize = 15)
ax.set_title('Hourly Average Uber Pickups By Day of the Week in NYC (April-September 2014)', fontsize=16)
ax.tick_params(labelsize = 8)
# Re-attach the reordered legend with a title and smaller fonts.
ax.legend(handles,labels,loc=0, title="Days", prop={'size':10})
ax.get_legend().get_title().set_fontsize('8')
plt.show()
# Total pickups per TLC base company code.
Data.Base.value_counts()
Base = Data.groupby("Base").count()
plt.figure(figsize=(12,5))
sns.set(style="whitegrid")
ax = sns.barplot(x = Base.index, y = Base.Lat)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right",fontsize=12)
plt.tight_layout()
plt.show()
# Map TLC base codes to their company names for readable labels.
# NOTE(review): this rebinds `Base`, shadowing the groupby result created just
# above; rename one of them if later cells need the groupby table.
Base = {"Base": {'B02617':'Weiter', 'B02598':'Hinter','B02682':'Schmecken','B02764':'Danach-NY','B02512':'Unter'}}
Uber_2014_Bases = Data.copy()
Uber_2014_Bases.replace(Base, inplace=True)
Uber_2014_Bases.head()
Uber_2014_Bases.shape
# NOTE(review): 2551746 looks like a hard-coded row count — confirm it still
# matches len(Uber_2014_Bases) after cleaning; iloc silently truncates otherwise.
Uber_2014_Bases1 = Uber_2014_Bases.iloc[:2551746,:]
import plotly.express as px
# Interactive pickups-per-base histogram.
fig = px.histogram(Uber_2014_Bases1, x = "Base", title='Pickups Per Base', labels={'count':'No of Pickups'})
fig.show()